rm(list = ls())
cat("\014")

## set up some global options
# always set stringsAsFactors = F when loading data
options(stringsAsFactors=FALSE)

# show the code
knitr::opts_chunk$set(echo = TRUE)

# define all knitr tables to be html format
options(knitr.table.format = 'html')

# change code chunk default to not show warnings or messages
knitr::opts_chunk$set(warning = FALSE, message = FALSE)

libs <- c("tidyverse", "knitr", "readxl", "dismo", "leaflet", "kableExtra", "farmr")
#"NbClust", "clValid", "ggfortify", "clustree", "dendextend", "FactoMineR", "corrplot", "GGally"
lapply(libs, require, character.only = T, quietly = T, warn.conflicts = F)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0       ✔ purrr   0.3.0  
## ✔ tibble  2.0.1       ✔ dplyr   0.8.0.1
## ✔ tidyr   0.8.2       ✔ stringr 1.4.0  
## ✔ readr   1.3.1       ✔ forcats 0.3.0
## Warning: package 'ggplot2' was built under R version 3.4.4
## Warning: package 'tidyr' was built under R version 3.4.4
## Warning: package 'readr' was built under R version 3.4.4
## Warning: package 'dplyr' was built under R version 3.4.4
## Warning: package 'stringr' was built under R version 3.4.4
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## Warning: package 'readxl' was built under R version 3.4.4
## Warning: package 'sp' was built under R version 3.4.4
## 
## Attaching package: 'raster'
## The following object is masked from 'package:dplyr':
## 
##     select
## The following object is masked from 'package:tidyr':
## 
##     extract
## Warning: package 'leaflet' was built under R version 3.4.4
## Warning: package 'kableExtra' was built under R version 3.4.4
## [[1]]
## [1] TRUE
## 
## [[2]]
## [1] TRUE
## 
## [[3]]
## [1] TRUE
## 
## [[4]]
## [1] TRUE
## 
## [[5]]
## [1] TRUE
## 
## [[6]]
## [1] TRUE
## 
## [[7]]
## [1] TRUE
#### define helpful functions
# define function to adjust table widths
html_table_width <- function(kable_output, width) {
  width_html <- paste0(paste0('<col width="', width, '">'), collapse = "\n")
  sub("<table>", paste0("<table>\n", width_html), kable_output)
}

source("../../oaflib/commcareExport.R")
## 
## Attaching package: 'curl'
## The following object is masked from 'package:readr':
## 
##     parse_date
## Warning: package 'jsonlite' was built under R version 3.4.4
## 
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
## 
##     flatten
source("../../oaflib/misc.R")
source("../../oaflib/plm.R")
select <- dplyr::select

forceUpdateAll <- FALSE

1 Background

This file will pick up on from the Kenya round 1 file and finish the analysis. The cleaning of the data up through the round 1 data (baseline and round 1) happens in the kenya_round_1 folder. Going forward, the cleaning of new data will happen in the shs_cleaning.R file, which I’ll source here, and then the analysis will happen here.

2 Objective

Analyze the soil health study data.

3 Data

# load the latest kenya data from the cleaning file >> or maybe do the combining there.
#source("shs_cleaning.R")

keDat <- readRDS("ke_cleaned_combined_fieldDat.rds")
rwDat <- readRDS("rw_cleaned_combined_fieldDat.rds")

4 Analysis

4.1 Check identifiers

It’s hard to know where to draw the line between cleaning and analysis. I’m going to keep this code here even though it’s technically additional data cleaning

Check that we’re actually dealing with clients for which we have multiple records

table(table(rwDat$sample_id)==5) 
## 
## FALSE  TRUE 
##   157  2328

So we have some clients for which we don’t have 3 records, one for each survey. Find out what the deal is.

table(table(rwDat$sample_id))
## 
##    1    2    3    4    5 
##    1    6  110   40 2328

Hm. So the numbers less than 5 could be that we’re just not finding people every year. But the 1s and 2s are particularly strange. Let’s check those out. Here’s a helpful link

singleSurvey <- names(which(table(rwDat$sample_id) == 1))

rwDat %>%
  filter(sample_id == singleSurvey)

Looks like this person was only surveyed in the first season. Okay. Check out the other seasons

twoSurveys <- names(which(table(rwDat$sample_id) == 2))
rwDat %>%
  filter(sample_id %in% twoSurveys)

Okay, these are from the recent round but they don’t have any previous surveys which is odd. We shouldn’t be adding new farmers into the survey. These can be dropped for simplicity. I can follow up with Eric and Cyprien to find out the issue but I’m just going to drop them for now.

rwDat <- rwDat %>%
  filter(!sample_id %in% twoSurveys)
fourSurveys <- names(which(table(rwDat$sample_id)==4))
rwDat %>%
  filter(sample_id %in% fourSurveys)

Okay. There are enough surveys in the 3 and 4 category that they’re at least plausible from a data quality and survey strategy perspective. I’ll leave them as they are.

4.2 Surveys for which we don’t have soil

Check that there isn’t something systematic in these surveys. If so, describe what it is.

4.2.1 Attrition analysis for appendix

Create a record of how many farmers are joining and leaving Tubura between the baseline through the current survey round. These numbers only reflect the changes from the last round.

This also should account for clients that we miss in each season so we can say what the attrition has been season to season. Also, why don’t we have 16A results for Rwanda. Find out what’s happening there.

clientCount <- function(dat){
  # this assumes season, d_client, and sample_id remain the names of 
  # the key variables. They should for the data to match.
  
  # this funciton tablulates the number of clients in each season
  iniTab <- dat %>%
    filter(!is.na(d_client)) %>%
    group_by(season, d_client) %>%
    tally() %>%
    gather(key, value, -c(season, d_client)) %>%
    spread(d_client, value) %>%
    rename(control = `0`,
         client = `1`) %>%
    mutate(control = as.numeric(control),
         client = as.numeric(client)) %>%
    rowwise() %>%
    mutate(total = sum(control, client)) %>%
    ungroup()
    
  percentDiff <- iniTab %>%
    mutate(pct_change = (((total - lead(total))/total) * 100))
  return(percentDiff)    
  
}

rw.attrition.tab <- clientCount(rwDat)

ke.attrition.tab <- clientCount(keDat)
attritionRate <- function(org, new){
  return((org - new)/org * 100)
}


attritionRate(2028, 1935)  # kenya
## [1] 4.585799
attritionRate(2439, 2409) # rwanda
## [1] 1.230012
attritionRate(4467, 4344) # total
## [1] 2.753526

4.3 Soil graphs

ggplot(rwDat, aes(x = season, y = ph, group = d_client)) + geom_boxplot()

4.3.1 Variable check

ggplot(keDat, aes(x = can.acre)) + 
  geom_histogram() +
  scale_x_continuous(limits = c(0,200)) +
  labs(title = "Set limits to 0 and 100 for CAN")

ggplot(keDat, aes(x = dap.acre)) + 
  geom_histogram() +
  scale_x_continuous(limits = c(0,200)) +
  labs(title = "Set limits to 0 and 100 for DAP")

#export data for Diego >> this is the easiest way to have the location and soil information together. The soil data is also available through the warehouse but it isn't easily connected to the survey information.

rwDat %>%
  select(sample_id, season, district, cell_field, village, ph, calcium, magnesium, organic.carbon, total.nitrogen) %>%
  write_csv(., "output/rwanda_soil_values_travertine_impact.csv")

5 Digging into data

Check values for upcoming final round of Kenya SHS data collection

This is ideally a grouped boxplot. Fix this when I have internet. This would show the values by season and client type.

See sketch of SHS report. Remember that sameStatus are the farmers that kept their status between baseline and endline. The two models of interest are:

  • Individual fixed effects account for things specific to farmer that don’t change over time
  • can control for unobserved sources of heterogeneity over time, very sensitive to model
  • add in other data points that do change over time
  • so add in things that change over time that plausibly affect our outcome
  • fertilizer and seed use are synonymous with being a client or not, highly endogenous
  • run two regs
  • one with oaf
  • one with oaf and fertilizer
  • things like slope are collinear
  • individual fixed effects makes more sense than using PSM now that we have multiple years.
  • means by directional changes
  • papers using fixed effects by Miguel on whether changes to rural to urban areas and income

Helpful link for executing code in parallel

5.1 Nitrogen effect overview

Work before 3/18/19

Before running the full models, I’m going to take a closer look at the nitrogen data to make sure we can explain the negative effects we’re seeing in Kenya in the models below. There isn’t a clear agronomic explanation for the negative effect and as a consequence we’re put in a bind in our reporting to explain either the outcome or the modeling.

Let’s start with looking at the relationship between nitrogen and other features to make sure we’re seeing what expect. Here are the notes from Step:

  • We might expect that increased application of fertilizer N by our clients could build up organic N because some of the fertilizer N applied would be retained by microbial immobilisation.
  • Carbon hasn’t decreased, so I don’t think we can attribute loss of TSN to loss of SOM.
  • A possible explanation is that we have increased mineralization of N by virtue of farmers in our program increasing input of N relative to input of C (compared to control farmers), ultimately resulting in a reduction in organic N through crop uptake, leaching, or denitrification.
    • So takeaway here would be that we need to increase C inputs, rather than N inputs >> we should be able to check this (at least roughly), by looking at farmer reported OM inputs to the fields?

Click here for compost / acre vs. nitrogen rate effects. The short is that there isn’t a clear trend.

  • I think increased crop uptake is more likely than leaching or denitrification - If there was more N leaching, we’d expect to see lower pH (and lime adoption amongst clients was <1% at the time we did sampling, and application rates are in any case very low - 500 kg/ha)
  • Our farmers mostly follow our trainings on deep placement micro-dosing, so hopefully denitrification risk is mitigated pretty well.
  • Relatedly, it’s possible that that crop uptake is exceeding fertilizer application anyway i.e. we have a nutrient mining problem.
  • For yields of 3.9 t/ha (mean for our clients in seasons 2014-17), N removal would be ~70 kg/ha, but our current recommendation (aligned with govt) is ~55 kg/ha (50kg/acre DAP + 50kg/acre CAN).

Nitrogen application by yield output. The yield values are not making sense between survey rounds. I’ll come back to this.

  • Conclusion here would be that we need to increase N inputs (though get just trying to plug the N ‘gap’ in the form of mineral fertilizer doesn’t necessarily result in a balanced nutrient budget e.g. N deposition might supply ~5-10 kg N/ha anyway). I suspect this is the case anyway, because our N rates do seem too low, especially in ratio to our P rates. However, this would only be driving a reduction relative to non-clients if our n application relative to yields achieved is lower. Finally, maybe our farmers are rotating less with legumes than non-farmers, or rotating with legumes that are less prolific in fixing N.

Look at rotations relative to N rate by 1AF status. It does appear that plots that non-1AF plots receive more legume intercrops. This makes sense given that the 1AF core package emphasizes maize monocrops.

Would love to hear your thoughts - entirely possible I’m missing something here. The most critical question here is whether (i) we’re not applying enough C relative to N, (ii) we’re just not applying enough N. I suspect maybe the answer is both, but I’m wary of assuming it’s the latter and making things worse if the key driver is unbalanced C:N ratios.

3/18/19 forward

Step Aston of ART wants follow up on the possibility that 1AF is driving down N rates through repeated 1AF culti ation. This document lists the key questions we want to answer to understand what might be happening in the data. Those questions are (see document for more specifics):

  • Are there spatial patterns to the N reduction effect - is this a genuine effect across all program areas, or specific only to certain areas?
  • Are there differences between control and treatment in how long fields have been farmed for?
  • Are control farmers applying a higher ratio of N compared to C than control farmers?
  • Is N application vs N uptake ratio different for treatment vs control?
  • Are our farmers rotating or intercropping with legumes less than non-farmers, or with legumes that are less prolific N-fixers? (i.e. is quality of organic inputs changing to a wider C:N ratio)

5.2 Nitrogen alone

The big takeaway for me from this plot is that there are regular spikes at certain values. That seems odd for a system that should be pretty continuous. Let’s see if that plays out evenly across seasons.

keDat %>%
  ggplot(., aes(x = total.nitrogen)) + 
  geom_histogram(bins = 100) +
  geom_density()

keDat %>%
  ggplot(., aes(x = total.nitrogen)) + 
  geom_histogram() +
  geom_vline(xintercept = mean(keDat$total.nitrogen, na.rm = T)) +
  facet_grid(~ season)

# ggplot() +
#   geom_histogram(data = filter(keDat, keDat$season == 2015), aes(x = total.nitrogen, fill = 'red'), alpha = 0.5) +
#   geom_histogram(data = filter(keDat, keDat$season == 2016), aes(x = total.nitrogen, fill = 'blue'), alpha = 0.5) +
#   geom_histogram(data = filter(keDat, keDat$season == 2017), aes(x = total.nitrogen, fill = 'green'), alpha = 0.5) +
#   theme(legend.position = 'none')

5.3 Soil N by season

This is what we’re observing. There’s a clear downward trend across seasons with perhaps a slight shift downward for

keDat %>%
  #select(season, total.nitrogen, d_client) %>%
  ggplot(., aes(x = season, y = total.nitrogen, fill = as.factor(d_client))) + 
  geom_boxplot() + 
  theme(legend.position = 'bottom') +
  labs(title = "Total N by season and client status", subtitle ="Strong seasonal trend with perhaps a slight client trend downward",
       x = "Season", y = "Total N", fill = "Client Status")

keDat %>%
  filter(!is.na(d_client)) %>%
  group_by(season, d_client) %>% 
  summarize(mean = round(mean(total.nitrogen, na.rm = T),4)) %>%
  kable(caption = "Downward average N by season") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Downward average N by season
season d_client mean
2015 0 0.1577
2015 1 0.1579
2016 0 0.1499
2016 1 0.1484
2017 0 0.1478
2017 1 0.1466

1AF clients are slightly below comparison farmers in terms of total nitrogren but the difference is ever so slight. Perhaps in the context of soil chemistry these are big differencs though!

5.4 Nitrogen and carbon

This code also addresses identified erroneous variables.

keDat <- keDat %>%
  mutate(organic.carbon = ifelse(organic.carbon == 0 & total.nitrogen > 0, NA, organic.carbon))
keDat %>%
  ggplot(., aes(x = organic.carbon, y = total.nitrogen, color = as.factor(d_client))) + 
  geom_point() +
  labs(title = "N vs. C - what we'd expect", x = "Total Carbon", y = "Total Nitrogen",
       subtitle = "And no clear client, non-client trend", color = "1AF client")

5.5 Soil C vs. soil N by season

keDat %>%
  filter(!is.na(d_client)) %>%
  ggplot(., aes(x = organic.carbon, y = total.nitrogen, color = as.factor(d_client))) + 
  geom_point() +
  labs(title = "N vs. C by season - again what we'd expect", x = "Organic C", y = "Total N", color = "1AF client",
       subtitle = "and no clear client, non-client trend") +
  facet_grid(~ season)

This is the same table as above but shows carbon level by season and client status. We don’t see the same gap widening between client and non-client plots that we see with nitrogen.

keDat %>%
  filter(!is.na(d_client)) %>%
  group_by(season, d_client) %>% 
  summarize(mean = round(mean(organic.carbon, na.rm = T),4)) %>%
  kable(caption = "Carbon levels trend together by client status over seasons") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Carbon levels trend together by client status over seasons
season d_client mean
2015 0 2.1606
2015 1 2.1700
2016 0 1.8588
2016 1 1.8506
2017 0 1.8625
2017 1 1.8632

5.6 Applied carbon vs. soil nitrogen

if you applied compost since there are so many 0s and removing super large values

There’s no clear visual trend between compost application and nitrogen rates.

keDat %>%
  filter(compost.acre > 0 & compost.acre < 1000) %>%
  ggplot(., aes(x = compost.acre, y = total.nitrogen, color = as.factor(d_client))) +
  geom_point() +
  facet_grid(~ season) +
  labs(title = "N vs. compost", subtitle = "Relationship less clear", x = "Compost kg / acre", y = "")

5.7 Applied N vs. soil nitrogen

# dap 18% n, can 26%
keDat %>%
  mutate(can_n = can.acre * 0.26,
         dap_n = dap.acre * 0.18) %>%
  rowwise() %>%
  mutate(total_n_applied = sum(can_n, dap_n, na.rm = T)) %>%
  filter(total_n_applied > 0) %>%
  ggplot(., aes(x = total_n_applied, y = total.nitrogen, color = as.factor(d_client))) +
  geom_point() +
  facet_wrap(~ season + d_client)

Interpretation: There doesn’t seem to be a strong relationship between total N applied and total nitrogen in the soil. It’s not clear to me what we should have expected from this relationship but in general I don’t see anything odd or a clear difference between the 1AF plots and the treatment plots.

5.8 Applied N vs. applied C by season

unique_seasons <- unique(keDat$season)

nitrogen_v_compost <- lapply(unique_seasons, function(year){
  return(keDat %>%
    filter(season == year) %>%
    mutate(can_n = can.acre * 0.26,
           dap_n = dap.acre * 0.18) %>%
    rowwise() %>%
    mutate(total_n_applied = sum(can_n, dap_n, na.rm = T)) %>%
    filter(total_n_applied > 0)) %>%
    filter(compost.acre < 2000 & compost.acre > 0)
})

lapply(nitrogen_v_compost, function(x){
  ggplot(x, aes(x = total_n_applied, y = compost.acre, color = as.factor(d_client))) +
    geom_point() +
    #facet_wrap(~ d_client, scales = 'free') +
    geom_smooth(method = 'lm') +
    labs(title = paste("Total N applied / acre vs. compost / acre in", unique(x$season)),
         subtitle = "Compost / acre less than 2000 kgs",
         x = "Total kg N / acre",
         y = "Total kg compost / acre")
})
## [[1]]

## 
## [[2]]

## 
## [[3]]

Interpretation I don’t see a strong difference across seasons between total N applied and total compost applied. The trend lines seem generally the same or at least visually not so strong to suggest that 1AF plots and control plots are having different experiences. The 2017 plot indicates a slightly flatter relationship between applied N and applied C on 1AF plots but this is not super dramatic.

5.9 Applied C vs. soil N by season

applied_c_vs_soil_n <- lapply(unique_seasons, function(year){
  return(keDat %>%
    filter(season == year) %>%
    filter(compost.acre < 2000 & compost.acre > 0))
})

lapply(applied_c_vs_soil_n, function(x){
  ggplot(x, aes(x = compost.acre, y = total.nitrogen, color = as.factor(d_client))) +
      geom_point() +
      geom_smooth(method = 'lm') +
      labs(title = paste("Applied C vs. soil N in", unique(x$season)),
           subtitle = "No clear difference",
           x = "Total kg compost / acre",
           y = "Total soil N (%))")
})
## [[1]]

## 
## [[2]]

## 
## [[3]]

Interpretation As with the applied N vs. applied C, I don’t see a clear difference in outcome between 1AF and control plots when we look at applied C and soil N.

5.10 Legume rotation and nitrogen levels

intercrop_check <- keDat %>%
  mutate(legume_intercrop = ifelse(intercrop %in% c("beans", "groundnuts", "soya"), 1, 0)) %>%
  filter(!is.na(sample_id))

intercrop_check %>%
  group_by(sample_id, d_client) %>%
  summarize(legume_count = sum(legume_intercrop, na.rm = T),
            total.nitrogen.average = mean(total.nitrogen, na.rm = T)) %>%
  filter(legume_count <= 3) %>%
  ggplot(., aes(x = as.factor(legume_count), y = total.nitrogen.average, fill = as.factor(d_client))) +
  geom_boxplot() +
  labs(title = "Legume intercropping (beans, groundnuts, soya) and nitrogen",
       subtitle = "Control plots have slightly higher N across years of legume intercrops",
       x = "Number of seasons of legume intercrops", 
       y = "Total soil N (%)", fill = "1AF status") + 
  theme_oaf()

The above plot shows N rates by the number of legume intercrops. It’s not a perfect view of what happens following a season in which a farmer rotates to beans but does show the cumulative benefit of additional seasons of a legume intercrop on the plot. Do 1AF farmers have fewer intercrops? This is hard to precisely pin down since the plots can shift from 1AF cultivation to not-1AF cultivation season to season. Therefore I’ll look at the average years of 1AF cultivation vs. legume intercrops

5.11 Legume intercrops by 1AF tenure

intercrop_check %>%
  group_by(sample_id) %>%
  summarize(years_oaf = sum(d_client, na.rm = T),
            number_intercrops = sum(legume_intercrop, na.rm = T)) %>%
  group_by(years_oaf, number_intercrops) %>%
  tally() %>%
  filter(years_oaf <= 3) %>%
  filter(number_intercrops < 4) %>%
  ggplot(., aes(x = as.factor(years_oaf), y = n, fill = as.factor(number_intercrops))) +
  geom_bar(stat = 'identity', position = 'dodge') +
  labs(title = "Legume intercrops by 1AF tenure", x = "Years of cultivation under 1AF", y = "Count of plots",
       subtitle = "") +
  theme(legend.position = 'bottom') +
  theme_oaf()

We’re looking for an indication that 1AF plots have fewer legume intercrops and it does appear that plots that have not been cultivated with 1AF have more years of legume intercropping (the blue bar on the left) that for plots that are 1AF plots for more seasons. This relationship is not necessarily statistically different as we also see years of numerous legume intercrops for 1AF plots as well. The main distinguishing feature however is the big bar indicating that plots that have never been 1AF plots have had 3 legume intercrops more than 1AF plots.

5.12 Models of legume intercropping and soil N

library(broom)

legume_model_data <- intercrop_check %>%
  group_by(sample_id) %>%
  summarize(years_oaf = sum(d_client, na.rm = T),
            number_intercrops = sum(legume_intercrop, na.rm = T),
            average_soil_nitrogen = mean(total.nitrogen, na.rm = T))


# plot these data to understand relationship.
legume_model_data %>%
  filter(number_intercrops < 4) %>%
  ggplot(., aes(x = as.factor(years_oaf), y = average_soil_nitrogen, fill = as.factor(number_intercrops))) +
  geom_boxplot() +
  theme_oaf() +
  labs(title = "Soil N by 1AF seasons and legume cultivations",
       x = "1AF seasons",
       y = "Average soil N",
       fill = "Legume cultivations")

cat("R squard for simple model:", summary(lm(average_soil_nitrogen ~ as.factor(number_intercrops), data = legume_model_data))$r.squared)
## R squard for simple model: 0.0056116
cat("R squard for model including years_oaf:", summary(lm(average_soil_nitrogen ~ as.factor(number_intercrops) + years_oaf, data = legume_model_data))$r.squared)
## R squard for model including years_oaf: 0.005687464

The r2 of models looking at the simple relationship of years of legume cultivation and soil nitrogen is very low. That doesn’t seem to be capturing the movement of soil nitrogen.

5.13 Legume selection by 1AF status

intercrop_check %>%
  filter(legume_intercrop == 1) %>%
  group_by(intercrop, d_client) %>%
  tally() %>%
  ggplot(., aes(x = intercrop, y = n, fill = as.factor(d_client))) +
  geom_bar(stat = 'identity', position = 'dodge') +
  labs(title = "Legume intercrop type by 1AF status",
       subtitle = "It's mostly beans, folks and control plots are intercropped a lot more regularly",
       x = "Intercrop type",
       y = "Number of plots", 
       fill = "1AF status") + 
  theme_oaf()

Interpretation: About 400 more control plots were planted with a bean intercrop over the course of the study. However, we don’t see additional seasons of legume intercrop explaining soil N levels in the data.

Does increased intercropping explain the difference between 1AF and control plots? Do the other pieces of evidence line up to support this?

intercrop_check %>%
  filter(legume_intercrop == 1) %>%
  group_by(intercrop, d_client) %>%
  tally() %>%
  mutate(d_client = ifelse(d_client == 0, "control", "one-acre")) %>%
  spread(d_client, n) %>%
  kable(., caption = "Total plots by intercrop type") %>%
  kable_styling()
Total plots by intercrop type
intercrop control one-acre
beans 1835 1452
groundnuts 57 27
soya 15 6

Control plots were intercropped with beans about ~400 times more than 1AF plots which is not nothing.

intercrop_check %>%
  filter(legume_intercrop == 1) %>%
  group_by(intercrop, d_client) %>%
  tally() %>%
  ungroup() %>%
  mutate(percent = paste(round(n / sum(n), 2) * 100, "%")) %>%
  select(-n) %>%
  mutate(d_client = ifelse(d_client == 0, "control", "one-acre")) %>%
  spread(d_client, percent) %>%
  kable(., caption = "Percent of plots by intercrop") %>%
  kable_styling()
Percent of plots by intercrop
intercrop control one-acre
beans 54 % 43 %
groundnuts 2 % 1 %
soya 0 % 0 %

5.14 Spatial trend in N levels

5.14.1 Boxplots by geography

Clean up region variable

keDat$region <- ifelse(grepl("west", tolower(keDat$region)), "Western", "Nyanza")

I want to create a metric of how different treatment values are from control values by district. I’m going to subtract the control average at the district level from each treatment value for each season. I’ll then plot them individually so it’s easier to see.

control_nitrogen_level <- function(year){
  return(keDat %>%
  filter(!is.na(district)) %>%
  filter(season == year) %>%
  group_by(district) %>%
  mutate(control_nitrogen_average = mean(total.nitrogen[d_client == 0], na.rm = T)) %>%
  ungroup() %>%
  mutate(difference_nitrogen = total.nitrogen - control_nitrogen_average) %>%
  filter(d_client == 1))
}

nitrogen_plots <- lapply(c(2015, 2016, 2017), function(year){
  control_nitrogen_level(year)
})
 

lapply(nitrogen_plots, function(x){
  ggplot(x, aes(x = district, y = difference_nitrogen, fill = region)) + 
  geom_boxplot() +
  theme(legend.position = "bottom", axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = paste("Difference in nitrogen ", unique(x$season)),
       subtitle = "I don't see a clear spatial trend across locations or time") +
    theme_oaf()
})
## [[1]]

## 
## [[2]]

## 
## [[3]]

Quickly, let’s output graphs for each district:

unique_districts <- unique(keDat$district)[!unique(keDat$district) %in% NA]

lapply(unique_districts, function(dist){
  keDat %>%
  filter(!is.na(district)) %>%
  filter(district == dist) %>%
  ggplot(., aes(x = season, y = total.nitrogen, fill = as.factor(d_client))) +
    geom_boxplot() +
    labs(title = paste("Seasonal trend for", dist),
         x = "Season",
         y = "Total soil N (%)") +
    theme_oaf()
})
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

When we look at this by district over time and 1AF plot status we definitely see a downward trend in some districts. We also see that in some cases, like Butere, the 1AF plots started well below controls. In ohters, like Suneka, we see the 1AF drop dramatically which is pretty difficult to explain. A couple other observations:

  • We see that the 2015 values are much higher than the next values. This is true for 1AF and control plots so that’s consistent but that highlights some in consistencies in our soil model measurements because we’re not actually losing that much N one season to the next.
  • KKM N is an interesting example for how we see a shift in 1AF plots to being below control plots. We have to remember that what is considered 1AF vs. control is farmer determined.

Farmers could be shifting plots with less N into 1AF cultivation and moving other plots out of 1AF rotation. Let’s check out how these trends look if we use the baseline assignment as our indicator for 1AF / control status.

When we look at the data based on the 2015 baseline treatment status, the difference between 1AF and control plots isn’t as extreme. The analytical model we’re using below should be taking into acocunt the effect of clients potentially switching their plot back and forth.

keDat %>%
  ggplot(., aes(x = region, y = total.nitrogen)) +
  geom_boxplot() + 
  facet_grid(~ season)

keDat %>%
  filter(!is.na(district)) %>%
  ggplot(., aes(x = district, y = total.nitrogen)) + 
  geom_boxplot() +
  facet_grid(season ~ .) +
  theme(legend.position = "bottom", axis.text.x = element_text(angle = 45, hjust = 1))

5.14.2 Maps - tbd

Making maps will be more involved so I’m hoping the boxplots are enough.

5.15 Nitrogen uptake

Assess whether ratio of N application rates vs N uptake rates is different between treatment and control. We’d have to make some assumptions about

  1. Yield (drawing from M&E harvest surveys? - or do we get some farmer recall on yield in the SHS survey itself?)
  2. N uptake per unit yield (drawn from IPNI nutrient removal calculator)
  3. N applied as compost/manure (I’m not clear on how much detail we have for Kenya on whether it’s manure vs compost, source etc?)

5.15.1 Yield and N application

Quick histogram of farmer estimated yields

ggplot(keDat, aes(x = yield.t.ha)) +
  geom_histogram(binwidth = 0.25) + 
  scale_x_continuous(limit = c(0, 20)) +
  labs(title = "Histogram of farmer est. yields (less than 20 t/ha)",
       subtitle = "Esitmates are very noisy. 75th percentile is 15 t/ha")

#kable(quantile(keDat$yield.t.ha, na.rm = T), caption = "Most observations are less than 15 t/ha which is still really high")
applied_n_vs_yield <- lapply(unique_seasons, function(year){
  return(
    keDat %>%
      filter(season == year) %>%
      mutate(can_n = can.acre * 0.26,
             dap_n = dap.acre * 0.18) %>%
      rowwise() %>%
      mutate(total_n_applied = sum(can_n, dap_n, na.rm = T)) %>%
      mutate(total_n_applied_ha = total_n_applied * 2.47) %>%
      filter(total_n_applied > 0) %>%
      filter(yield.t.ha < 10)) %>%
    as.data.frame()
})

lapply(applied_n_vs_yield, function(x){
  ggplot(x, aes(x = total_n_applied_ha, y = yield.t.ha, color = as.factor(d_client))) +
    geom_point() +
    facet_wrap(~ season, scales = 'free') + 
    geom_smooth(method = 'lm') +
    labs(title = paste("Applied N vs. farmer estimated yield", unique(x$season)),
         subtitle = "Limiting farmer est. yield to < 10 t/ha - Trend lines are messy but suggest no difference",
         x = "Total kg N / ha ",
         y = "Yield t/ha",
         fill = "1AF status")
})
## [[1]]

## 
## [[2]]

## 
## [[3]]

5.15.2 Yield by compost type

N applied as manure instead of plant material. Let’s look at compost by type by client status by season

keDat %>%
  mutate(compost_category = ifelse(grepl("pig|goat|cow|chicken", compost.type), "animal", "plant")) %>%
  filter(compost.kgs > 0) %>%
  filter(compost.acre < 2000) %>%
  ggplot(., aes(x = as.factor(compost_category), y = compost.acre, fill = as.factor(d_client))) + 
  geom_boxplot() +
  facet_wrap(~ season) +
  labs(title = "Compost application by type (excluding those that didn't apply)",
       subtitle = "1AF plots are getting more animal manure when compost is applied",
       x = "Compost type - animal or plant (plant residue / kitchen)",
       y = "Total kg compost / acre",
       fill = "1AF status")

Interpretation If anything this suggests that 1AF plots are getting more N rich compost. Unless I’m misunderstanding the consequence here, this should be contributing to N rates in the soil, not diminishing soil N.

6 Regressions

keySoilVars <- c("ph", "calcium", "total.nitrogen", "organic.carbon", "magnesium")

6.1 Kenya models

keDat <- keDat %>% mutate(age2 = age^2)


indFeList <- list("as.factor(d_client)", 
                  c("as.factor(d_client)", "as.factor(sample_id)"),
                  c("as.factor(d_client)", "as.factor(sample_id)", "as.factor(season)"),
                  c("as.factor(d_client)", "as.factor(sample_id)", "as.factor(season)", "age", "age2"))


# run this in parallel to speed up the process
# load the data and variables and packages into the cluster
regFileKe <- "regFile_through2017.RData"
forceUpdate <- forceUpdateAll

if(!file.exists(regFileKe) || forceUpdate) {
library(parallel)
no_cores <- detectCores() - 1

cl <- makeCluster(no_cores, type="FORK")
clusterEvalQ(cl, "plm")
clusterExport(cl, "keDat")
clusterExport(cl, "keySoilVars")
clusterExport(cl, "indFeList")

indFeLoopKe <- parLapply(cl, indFeList, function(mod){
  lapply(keySoilVars, function(outcome){
    form = lm(reformulate(termlabels = mod, response = outcome), data=keDat)
    
    pdf(file=paste("output/ke2017/", paste0(outcome, paste(mod, collapse = "")), ".pdf", sep = "")) 
    print(plot(form))
    dev.off()
    
    form = plm(form, c("sample_id", "age", "age2"))
    
    rownames(form) = paste(rownames(form), outcome, sep = " ")
    return(form)
  })
  
})
stopCluster(cl)
save(indFeLoopKe, file=regFileKe)
} else {
  load(regFileKe)
}

And combine model outputs into tables for each model

modExportKe <- lapply(indFeLoopKe, function(models){
  do.call(rbind, models)
})


for(i in 1:length(modExportKe)){
  write.csv(modExportKe[i], file=paste0("output/ke2017/","regOutput_", i, ".csv"), row.names = T)
}

In the individual fixed effect model above, the naive model would only include a client indicator and individual fixed effects. If we add season, we lose significance on almost everything. I’d guess that as we add more likely controls we additionally lose significance. I’ve included age and age squared along the lines of Hicks et.al.

finalModelKe <- modExportKe[4]

kable(finalModelKe, format="markdown")
Coefficient 95% Confidence Interval P-Value
(Intercept) ph 5.5000 5 to 5.9 <0.001 ***
as.factor(d_client)1 ph -0.0340 -0.089 to 0.02 0.22
as.factor(season)2016 ph -0.3900 -0.41 to -0.37 <0.001 ***
(Intercept) calcium 470.0000 15 to 920 0.043 *
as.factor(d_client)1 calcium -66.0000 -120 to -10 0.02 *
as.factor(season)2016 calcium -50.0000 -75 to -24 <0.001 ***
as.factor(season)2017 calcium -28.0000 -54 to -1.7 0.037 *
(Intercept) total.nitrogen 0.1100 0.087 to 0.12 <0.001 ***
as.factor(d_client)1 total.nitrogen -0.0028 -0.005 to -0.00055 0.015 *
as.factor(season)2016 total.nitrogen -0.0091 -0.01 to -0.008 <0.001 ***
as.factor(season)2017 total.nitrogen -0.0110 -0.012 to -0.0099 <0.001 ***
(Intercept) organic.carbon 1.2000 0.92 to 1.5 <0.001 ***
as.factor(d_client)1 organic.carbon 0.0110 -0.026 to 0.048 0.55
as.factor(season)2016 organic.carbon -0.3100 -0.33 to -0.29 <0.001 ***
as.factor(season)2017 organic.carbon -0.3100 -0.32 to -0.29 <0.001 ***
(Intercept) magnesium 86.0000 14 to 160 0.019 *
as.factor(d_client)1 magnesium -5.8000 -14 to 2.5 0.17
as.factor(season)2016 magnesium -12.0000 -15 to -8.3 <0.001 ***
write.csv(finalModelKe, file="output/ke2017/indFe_ke2017.csv")

6.2 Rwanda models

The parallel model wasn’t running for some reason so I’m just going to run the one model I really want to look at and save those results.

rwDat <- rwDat %>% mutate(age2 = age^2)

rwDat <- do.call(data.frame,lapply(rwDat, function(x){ 
  replace(x, is.infinite(x),NA)
  }))
indFeList <- list("as.factor(d_client)", 
                  c("as.factor(d_client)", "as.factor(sample_id)"),
                  c("as.factor(d_client)", "as.factor(sample_id)", "as.factor(season)"),
                  c("as.factor(d_client)", "as.factor(sample_id)", "as.factor(season)", "age", "age2"))


# run this in parallel to speed up the process
# load the data and variables and packages into the cluster
regFileRw <- "regFile_through2017b.RData"
#forceUpdate <- forceUpdateAll

if(!file.exists(regFileRw) || forceUpdate) {
library(parallel)
no_cores <- detectCores() - 1

cl <- makeCluster(no_cores, type="FORK")
clusterEvalQ(cl, "plm")
clusterExport(cl, "rwDat")
clusterExport(cl, "keySoilVars")
clusterExport(cl, "indFeList")

indFeLoopRw <- parLapply(cl, indFeList, function(mod){
  lapply(keySoilVars, function(outcome){
    
    form = lm(reformulate(termlabels = mod, response = outcome), data=rwDat)
    
    pdf(file=paste("output/rw2017b/", paste0(outcome, paste(mod, collapse = "")), ".pdf", sep = "")) 
    print(plot(form))
    dev.off()
    
    form = plm(form, c("sample_id", "age", "age2"))
    
    rownames(form) = paste(rownames(form), outcome, sep = " ")
    return(form)
  })
  
})
stopCluster(cl)
save(indFeLoopRw, file=regFileRw)
} else {
  load(regFileRw)
}

modExportRw <- lapply(indFeLoopRw, function(models){
  do.call(rbind, models)
})

for(i in 1:length(modExportRw)){
  write.csv(modExportRw[i], file=paste0("output/rw2017b/","regOutput_", i, ".csv"), row.names = T)
}

finalModelRw <- modExportRw[4]

kable(finalModelRw, format="markdown")
write.csv(finalModelRw, file="output/rw2017b/indFe_rw2017.csv")

7 Appendix

Nothing to see here